# Pandas is used for data manipulation
import pandas as pd
import numpy as np

# Read in data and display first 5 rows
df1 = pd.read_csv('random_forest_data.csv')
df1.head()

# Count missing values per column (inspection only)
df1.isnull().sum()
print('The shape of our features is:', df1.shape)

# Inspect the distinct values of one string column before encoding
df1['analysis_url'].unique()

# Convert the string-valued columns to int64 codes so the regressor can
# consume them. pd.factorize returns (codes, uniques); keep only the codes.
for col in ["analysis_url", "id", "track_href", "type", "uri"]:
    df1[col] = pd.factorize(df1[col])[0].astype(np.int64)

# BUG FIX: DataFrame.drop returns a new frame; the original statement
# discarded the result, so the "Unnamed: 0" index column was never removed.
df1 = df1.drop(columns="Unnamed: 0")
# Use numpy to convert to arrays
import numpy as np

# Labels are the values we want to predict: the track's valence.
# (The original assigned y twice — df1.valence and df1['valence'] are the
# same column — so a single assignment suffices.)
y = df1['valence']

# Predictor columns: the numeric audio features plus the factorized
# string columns produced during preprocessing. song_title and artist
# were excluded because all of their values are unique.
features = ["acousticness", "analysis_url", "danceability", "duration_ms",
            "energy", "id", "instrumentalness", "key", "liveness",
            "loudness", "mode", "speechiness", "tempo", "time_signature",
            "track_href", "type", "uri"]
target = ["valence"]

# Feature matrix: keep only the predictor columns.
X = df1[features]
print(X)
print(y)
# Partition the data: 70% training, 30% held-out test, fixed seed so the
# split is reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1)

# Fit a random forest regressor with 20 trees (seeded for reproducibility).
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=20, random_state=42)
rf.fit(X_train, y_train)

# R^2 of the fitted forest on the held-out rows
rf.score(X_test, y_test)

# Predictions for the test set
y_pred = rf.predict(X_test)
print(y_pred)
# Sweep the forest size from 1 to 19 trees and report the test-set R^2
# for each, to see how the score responds to n_estimators.
# BUG FIX: the original loop body was not indented, which is a SyntaxError.
for i in range(1, 20):
    model = RandomForestRegressor(n_estimators=i)
    model.fit(X_train, y_train)
    print("Model score for no of trees", i, " is : ", model.score(X_test, y_test))

# Error metrics for the 20-tree model's predictions on the test set
from sklearn import metrics

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
from sklearn.tree import export_graphviz
from subprocess import call
from IPython.display import Image

# Export one tree (index 5) from the fitted forest to Graphviz .dot format.
# NOTE(review): the original passed class_names, but that argument only
# applies to classifiers — this is a regression tree, so it was dropped.
# The duplicate redefinition of the features/target lists was also removed;
# `features` defined earlier in the script is reused here.
estimator = rf.estimators_[5]
export_graphviz(estimator, out_file='tree.dot',
                feature_names=features,
                rounded=True, proportion=False,
                precision=2, filled=True)

# Render the .dot file to PNG with the Graphviz CLI (`dot` must be on PATH).
# List form with shell=False (the default) avoids shell-injection issues.
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])

# Display the rendered tree inline (notebook environments only)
Image(filename='tree.png')
Random sampling of data points, combined with random sampling of a subset of the features at each node of the tree, is why the model is called a ‘random’ forest.
Furthermore, notice that only a handful of the audio features actually drive the splits in this particular decision tree! According to this tree, the remaining features contribute little to predicting a track's valence. Visualizing the tree has increased our domain knowledge of the problem, and we now know which features to look at first if we are asked to make a prediction!